*Dataset with singles*

* Load the base sample, discarding whatever is currently in memory
use "Data\sample1_new.dta", clear

* Restrict the dataset to the variables referenced further down
keep pnr aar koen civst aegte_nr faelle_nr ///
     final_educ hfaudd fined educ_eika grad_region ///
     individual old_ambition extreme_ambition erhvervsindk_13 ///
     wage_growth_ambition wage_start_mean_ambition ///
     ten_wage_ambition start_wage_ambition


* Region-specific imputation for primary-school leavers.
* For every graduation-region code 81-85 this loop
*   1) copies the ambition measures observed on the region-specific donor
*      education code (1109 or 1110 followed by the region code) onto the
*      observations whose final education is a plain primary-school code
*      and whose grad_region equals that region, and
*   2) appends the region code to final_educ for those observations.
* The donor value is taken as the maximum over all donor observations.
* NOTE(review): this presumably relies on the measure being constant within
* a donor code - confirm.
forvalues i = 81/85 {

	* Codes 1007 1008 1009 1022 1023 1123 borrow from donor code 1109 + region
	foreach outcome in wage_growth_ambition wage_start_mean_ambition {
		gen temp_donor = `outcome' if final_educ == 1109`i'
		egen temp_donor_max = max(temp_donor)
		replace `outcome' = temp_donor_max if inlist(final_educ, 1007, 1008, 1023, 1123, 1009, 1022) & grad_region == `i'
		drop temp_donor temp_donor_max
	}

	* Append the region code to final_educ for exactly the codes imputed above.
	* The original script recoded 1107 here although the imputation targets 1007,
	* so 1007 was left without a region suffix; the list now matches.
	foreach code in 1007 1008 1023 1123 1009 1022 {
		replace final_educ = `code'`i' if final_educ == `code' & grad_region == `i'
	}

	* Code 1010 borrows from donor code 1110 + region
	foreach outcome in wage_growth_ambition wage_start_mean_ambition {
		gen temp_donor = `outcome' if final_educ == 1110`i'
		egen temp_donor_max = max(temp_donor)
		replace `outcome' = temp_donor_max if final_educ == 1010 & grad_region == `i'
		drop temp_donor temp_donor_max
	}

	replace final_educ = 1010`i' if final_educ == 1010 & grad_region == `i'

}

* 3.g: observations with final_educ 1097 inherit both ambition measures
* from the observations with final_educ 1198 (maximum over the donor rows)
foreach outcome in wage_growth_ambition wage_start_mean_ambition {
	generate temp_donor = `outcome' if final_educ == 1198
	egen temp_donor_max = max(temp_donor)
	replace `outcome' = temp_donor_max if final_educ == 1097
	drop temp_donor temp_donor_max
}

* Attach the field-of-study code to every person-year.
* Rows that exist only in the using file are not kept and no _merge
* indicator is left behind.
merge 1:1 pnr aar using "Data\field_of_study.dta", keepusing(educ_field) keep(master match) nogenerate

* Codes 98 and 99 are treated as a missing field
replace educ_field = "" if inlist(educ_field, "98", "99")

* Inspect how many observations each field has
tabulate educ_field, sort

* Remove observations without a field as well as field code 15
drop if missing(educ_field) | educ_field == "15"



**** k-means of fields ****

** Split each field further by level **

* educ_field_level is the field code followed by the value of fined (1-4).
* It stays empty for fields outside the list and for other values of fined.
generate educ_field_level = ""

local field_codes 60 65 70 20 25 27 30 55 57 58 75 80 10 12 40 35 45 50
forvalues level = 1/4 {
	foreach field of local field_codes {
		replace educ_field_level = "`field'`level'" if educ_field == "`field'" & fined == `level'
	}
}

tabulate educ_field_level

* Per person: the most frequent field-level among the years in which hfaudd
* equals the first four digits of final_educ; ties go to the highest mode
generate matched_level = educ_field_level if hfaudd == real(substr(string(final_educ), 1, 4))
bysort pnr (aar): egen final_educ_field_level = mode(matched_level), maxmode
drop matched_level

* Group-level sums and counts per final_educ_field_level.
* Only observations satisfying ref_sample contribute; the result is then
* spread to every observation of the group through a group-wise max.
local ref_sample !missing(final_educ_field_level) & individual==1 & old_ambition==1 & extreme_ambition==0

* Sum of ten_wage_ambition; a sum of exactly zero is recoded to missing
bysort final_educ_field_level: egen contrib = total(ten_wage_ambition) if `ref_sample'
replace contrib = . if contrib == 0
bysort final_educ_field_level: egen wage10_ambition_field_temp = max(contrib)
drop contrib

* Number of non-missing ten_wage_ambition values
bysort final_educ_field_level: egen contrib = count(ten_wage_ambition) if `ref_sample'
bysort final_educ_field_level: egen n_wage10_ambition_field_group = max(contrib)
drop contrib

* Sum of start_wage_ambition; a sum of exactly zero is recoded to missing
bysort final_educ_field_level: egen contrib = total(start_wage_ambition) if `ref_sample'
replace contrib = . if contrib == 0
bysort final_educ_field_level: egen wage_start_ambition_field_temp = max(contrib)
drop contrib

* Number of non-missing start_wage_ambition values
bysort final_educ_field_level: egen contrib = count(start_wage_ambition) if `ref_sample'
bysort final_educ_field_level: egen n_wage_start_ambition_f_group = max(contrib)
drop contrib

* Carry the group totals and counts over to individual-level variables
sort pnr aar

generate wage10_ambition_f = wage10_ambition_field_temp
generate wage_start_ambition_f = wage_start_ambition_field_temp
generate n_wage10_individual_ambition_f = n_wage10_ambition_field_group
generate n_wage_start_individual_am_f = n_wage_start_ambition_f_group

* Field-level means: total divided by the number of contributing observations
generate wage10_mean_ambition_f = wage10_ambition_f / n_wage10_individual_ambition_f if n_wage10_individual_ambition_f > 0
generate wage_start_mean_ambition_f = wage_start_ambition_f / n_wage_start_individual_am_f if n_wage_start_individual_am_f > 0

* Wage growth: exp of the difference between the two means, minus one
* NOTE(review): presumably both wage measures are in logs - confirm
generate wage_growth_ambition_f = exp(wage10_mean_ambition_f - wage_start_mean_ambition_f) - 1

* Standardise both clustering inputs to mean 0 and standard deviation 1.
* The moments are kept in scalars; scalar() makes sure the scalar and not a
* variable with a matching name is used.
summarize wage_start_mean_ambition_f
scalar the_mean_s_f = r(mean)
scalar the_sd_s_f = r(sd)
generate wage_start_mean_ambition_s_f = (wage_start_mean_ambition_f - scalar(the_mean_s_f)) / scalar(the_sd_s_f)
summarize wage_start_mean_ambition_s_f

summarize wage_growth_ambition_f
scalar the_mean_g_f = r(mean)
scalar the_sd_g_f = r(sd)
generate wage_growth_ambition_s_f = (wage_growth_ambition_f - scalar(the_mean_g_f)) / scalar(the_sd_g_f)
summarize wage_growth_ambition_s_f

* k-means with 4 clusters on the standardised measures; the starting centres
* are 4 randomly chosen observations drawn with seed 1234
cluster kmeans wage_start_mean_ambition_s_f wage_growth_ambition_s_f, k(4) name(ambition_type_fields_s) start(krandom(1234))
tabulate ambition_type_fields_s
tabstat wage_start_mean_ambition_s_f wage_growth_ambition_s_f, by(ambition_type_fields_s)

* Distribution of fined within each cluster
tabulate ambition_type_fields_s fined, row

* Relationship indicators.
* relationship: civst is G or a faelle_nr is recorded
* married:      civst is G
* cohab:        in a relationship without being married
generate relationship = (civst == "G") | (faelle_nr != "")
generate married = (civst == "G")
generate cohab = (relationship == 1) & (married == 0)

* Couple identifier built on the PNR of the man: koen 1 contributes the own
* pnr, koen 2 contributes aegte_nr when married and faelle_nr when
* cohabiting. Everyone else keeps the placeholder value.
generate couple_id = "."
replace couple_id = pnr if koen == "1" & relationship == 1
replace couple_id = aegte_nr if koen == "2" & married == 1
replace couple_id = faelle_nr if koen == "2" & cohab == 1
sort couple_id aar

* Keep singles, plus partnered observations whose couple-year group contains
* both koen values, that is a mean of koen strictly between 1 and 2.
* NOTE(review): the filter applied again before saving requires a mean of
* exactly 1.5 - confirm which of the two rules is intended.
generate partner_sex = koen if relationship == 1
destring partner_sex, replace
by couple_id aar: egen mean_partner_sex = mean(partner_sex)
keep if relationship == 0 | (relationship == 1 & mean_partner_sex > 1 & mean_partner_sex < 2)
drop partner_sex mean_partner_sex



sort pnr aar

** Keep only those for whom we observe the ambition type **
drop if wage_growth_ambition == .

* Re-apply the couple filter after the drop above: keep singles, plus
* partnered observations whose couple-year group has a mean koen of exactly 1.5.
* NOTE(review): the earlier couple filter only requires a mean strictly
* between 1 and 2 - confirm which of the two rules is intended.
generate partner_sex = koen if relationship == 1
destring partner_sex, replace
bysort couple_id aar: egen mean_partner_sex = mean(partner_sex)
keep if relationship == 0 | (relationship == 1 & mean_partner_sex == 1.5)
drop partner_sex mean_partner_sex

sort pnr aar

* Write the final dataset to disk, overwriting an existing file
save "Data\dataset_with_types.dta", replace

